Review Midterm

Author

Hermann Junhao Fan

The questions will be the same as the sample exam, but the dataset is eggs_price data-set in lab4’s folder.

egg_data <- read.csv("eggs_price_2025.csv")
colnames(egg_data) <- c("Date", "Price")
plot_ly(egg_data, x = ~as.Date(Date)) %>%
  add_lines(y = ~Price, name = 'Egg prices (USD)', line = list(color = 'blue')) %>%
  layout(title = "Time series plot in Egg Prices",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Egg Prices"))
ts_egg <- ts(egg_data$Price, 
                start = decimal_date(as.Date(min(egg_data$Date))), 
                frequency = 12)  #Monthly dataa


plot(ts_egg, main = "Time Series plot")

Checking for stationary

f1 <- ggAcf(ts_egg, 50) + 
  ggtitle("ACF plot without differencing") + 
  theme_minimal()

f1

The ACF decays slowly to 0, which is a typical sign of non-stationary. The ADF Test below indicating the same.

tseries::adf.test(ts_egg)

    Augmented Dickey-Fuller Test

data:  ts_egg
Dickey-Fuller = -2.7064, Lag order = 8, p-value = 0.2793
alternative hypothesis: stationary

Lag plot:

gglagplot(ts_egg, do.lines = FALSE) +
  ggtitle("Lag Plot of Original Egg Prices") +
  theme_minimal()

From lag 1-4, we are able to see a clear autocorrelation.The lag plot does not show a consistent of autocorrelation.

require(gridExtra)
Loading required package: gridExtra

Attaching package: 'gridExtra'
The following object is masked from 'package:dplyr':

    combine
plot1<-autoplot(ts_egg, main="without the log transformation") 
plot2<-autoplot(log(ts_egg), main="log transformed data") 

grid.arrange(plot1, plot2,nrow=2)

There is no such big difference, so we do not need log transformation in this case.

tseries::adf.test(ts_egg)

    Augmented Dickey-Fuller Test

data:  ts_egg
Dickey-Fuller = -2.7064, Lag order = 8, p-value = 0.2793
alternative hypothesis: stationary

Not stationary, so take the differencing

fig1 <- ggAcf(ts_egg, 50) +
  ggtitle("ACF of egg's price time series") + 
  theme_minimal()

fig1

Non-stationary

library(patchwork)
Warning: package 'patchwork' was built under R version 4.2.3
diff <- diff(ts_egg)
fig_diff1 <- ggAcf(diff, 50) + 
  ggtitle("ACF For differenced egg price") + 
  theme_minimal() # q = 0, 1, 2
fig_diff2_pacf <- ggPacf(diff, 50) + 
  ggtitle("PACF For differenced egg price")+ 
  theme_minimal() # p = 0, 1, 2, 3

diff_2 <- diff(diff, lag = 12) # D = 1
fig_diff2_acf <- ggAcf(diff_2, 50) + 
  ggtitle("ACF for seasonal differencing")+
  theme_minimal() # Q = 1 or Q = 0

fig_diff2_pacf <- ggPacf(diff_2, 50) + 
  ggtitle("PACF for seasonal differencing") + 
  theme_minimal() # P = 1, 2, 3, 4
  
  
(fig_diff1 | fig_diff2_pacf) / (fig_diff2_acf | fig_diff2_pacf)

q = 0, 1, 2 = 1:2 p = 0, 1, 2, 3 = 1:3 d = 1 D = 1 Q = 0, 1 = 1:2 P = 1, 2, 3, 4 = 1:4

######################## Check for different combinations ########


#write a funtion
SARIMA.c=function(p1,p2,q1,q2,P1,P2,Q1,Q2,data){
  
  #K=(p2+1)*(q2+1)*(P2+1)*(Q2+1)
  
  temp=c()
  d=1
  D=1
  s=12
  
  i=1
  temp= data.frame()
  ls=matrix(rep(NA,9*35),nrow=35)
  
  
  for (p in p1:p2)
  {
    for(q in q1:q2)
    {
      for(P in P1:P2)
      {
        for(Q in Q1:Q2)
        {
          if(p+d+q+P+D+Q<=9)
          {
            
            model<- Arima(data,order=c(p-1,d,q-1),seasonal=c(P-1,D,Q-1))
            ls[i,]= c(p-1,d,q-1,P-1,D,Q-1,model$aic,model$bic,model$aicc)
            i=i+1
            #print(i)
            
          }
          
        }
      }
    }
    
  }
  
  
  temp= as.data.frame(ls)
  names(temp)= c("p","d","q","P","D","Q","AIC","BIC","AICc")
  
  temp
  
}

q = 0, 1, 2 = 1:2 p = 0, 1, 2, 3 = 1:3 d = 1 D = 1 Q = 0, 1 = 1:2 P = 1, 2, 3, 4 = 1:4

output=SARIMA.c(p1=1,p2=3,q1=1,q2=2,P1=1,P2=4,Q1=1,Q2=2,data=ts_egg)
#output

#knitr::kable(output)
output[which.min(output$AIC),] 
   p d q P D Q       AIC       BIC      AICc
22 2 1 0 0 1 1 -689.8555 -672.7867 -689.7789
output[which.min(output$BIC),]
   p d q P D Q       AIC       BIC      AICc
22 2 1 0 0 1 1 -689.8555 -672.7867 -689.7789
output[which.min(output$AICc),]
   p d q P D Q       AIC       BIC      AICc
22 2 1 0 0 1 1 -689.8555 -672.7867 -689.7789

It find out ARIMA(2, 1, 0)(0, 1, 1)[12] is the best model.

auto.arima(ts_egg)
Series: ts_egg 
ARIMA(4,1,1)(0,0,2)[12] with drift 

Coefficients:
         ar1     ar2      ar3      ar4      ma1    sma1    sma2   drift
      0.9745  0.0595  -0.0021  -0.1609  -0.9473  0.1057  0.1498  0.0045
s.e.  0.0506  0.0609   0.0617   0.0459   0.0297  0.0489  0.0572  0.0028

sigma^2 = 0.0155:  log likelihood = 361.64
AIC=-705.27   AICc=-704.93   BIC=-666.66

Auto ARIMA THINKs ARIMA(4, 1, 1)(0, 0, 2)[12] is the best model.

Diagnose:

# Set seed for reproducibility
set.seed(123)

# ------------------- Fit SARIMA Models & Extract Key Output Dynamically -------------------

# Model 1: SARIMA(2,1,0)(0,1,1)[12]
model_output1 <- capture.output(sarima(ts_egg, 2,1,0, 0,1,1,12))

# Find the relevant line numbers dynamically based on the keyword "Coefficients"
start_line1 <- grep("Coefficients", model_output1)
end_line1 <- length(model_output1)  

# Print the relevant section automatically
cat(model_output1[start_line1:end_line1], sep = "\n")
Coefficients: 
     Estimate     SE  t.value p.value
ar1    0.0590 0.0434   1.3601  0.1744
ar2    0.1413 0.0436   3.2421  0.0013
sma1  -0.9292 0.0237 -39.1341  0.0000

sigma^2 estimated as 0.01488395 on 524 degrees of freedom 
 
AIC = -1.309024  AICc = -1.308937  BIC = -1.276635 
 
# ------------------- Model 2: SARIMA(4,1,1)(0,0,2)[12] -------------------

model_output2 <- capture.output(sarima(ts_egg, 4,1,1, 0,0,2,12))

# Find the relevant line numbers dynamically based on the keyword "Coefficients"
start_line2 <- grep("Coefficients", model_output2)
end_line2 <- length(model_output2)  

# Print the relevant section automatically
cat(model_output2[start_line2:end_line2], sep = "\n")
Coefficients: 
         Estimate     SE  t.value p.value
ar1        0.9745 0.0506  19.2578  0.0000
ar2        0.0595 0.0609   0.9762  0.3294
ar3       -0.0021 0.0617  -0.0340  0.9729
ar4       -0.1609 0.0459  -3.5025  0.0005
ma1       -0.9473 0.0297 -31.9322  0.0000
sma1       0.1057 0.0489   2.1644  0.0309
sma2       0.1498 0.0572   2.6177  0.0091
constant   0.0045 0.0028   1.6052  0.1090

sigma^2 estimated as 0.01527376 on 531 degrees of freedom 
 
AIC = -1.30848  AICc = -1.307976  BIC = -1.236852 
 

ARIMA(2, 1, 0)(0, 1, 1)[12] is the best, since:

  • The Residual Plot shows nearly consistent fluctuation around zero, however, after 2020, the residuals fluctate more than the previous years, but still around 0, we could say that the residual is stationary.

  • The Autocorrelation Function (ACF) of the residuals reveals significant autocorrelations around lag 1.5 and high order lags, which are too high to include in the model.

  • The Q-Q Plot indicates that the residuals follow a near-normal distribution, with minor deviations at the tails, which is typical in time series data.

  • The P-value plots showing lots of insignificance from lag = 0 to lag = 16. AR2, AR3 are not significant in this case.

# ------------------- Fit SARIMA Models & Extract Key Output Dynamically -------------------

# Model 1: SARIMA(2,1,0)(0,1,1)[12]
model_output1 <- capture.output(sarima(ts_egg, 2,1,0, 0,1,1,12))

# Find the relevant line numbers dynamically based on the keyword "Coefficients"
start_line1 <- grep("Coefficients", model_output1)
end_line1 <- length(model_output1)  

# Print the relevant section automatically
cat(model_output1[start_line1:end_line1], sep = "\n")
Coefficients: 
     Estimate     SE  t.value p.value
ar1    0.0590 0.0434   1.3601  0.1744
ar2    0.1413 0.0436   3.2421  0.0013
sma1  -0.9292 0.0237 -39.1341  0.0000

sigma^2 estimated as 0.01488395 on 524 degrees of freedom 
 
AIC = -1.309024  AICc = -1.308937  BIC = -1.276635 
 

Fit the model:

  • The Residual Plot shows nearly consistent fluctuation around zero, however, after 2020, the residuals fluctate more than the previous years, but still around 0, we could say that the residual is stationary.

  • The Autocorrelation Function (ACF) of the residuals reveals significant autocorrelations around lag 0 and high order lags, which are too high to include in the model.

  • The Q-Q Plot indicates that the residuals follow a near-normal distribution, with minor deviations at the tails, which is typical in time series data.

  • except ar1 is not significant, all other parameters are significant.

fit <- Arima(ts_egg, order=c(2,1,0), seasonal=list(order=c(0,1,1),period=12))
summary(fit)
Series: ts_egg 
ARIMA(2,1,0)(0,1,1)[12] 

Coefficients:
         ar1     ar2     sma1
      0.0590  0.1413  -0.9292
s.e.  0.0434  0.0436   0.0237

sigma^2 = 0.01497:  log likelihood = 348.93
AIC=-689.86   AICc=-689.78   BIC=-672.79

Training set error measures:
                      ME      RMSE        MAE         MPE     MAPE      MASE
Training set 0.003310636 0.1205224 0.07092409 -0.03535937 4.723159 0.2889723
                    ACF1
Training set -0.01120366
library(ggplot2)
library(forecast)
library(tsibble)
Registered S3 method overwritten by 'tsibble':
  method               from 
  as_tibble.grouped_df dplyr

Attaching package: 'tsibble'
The following object is masked from 'package:zoo':

    index
The following object is masked from 'package:lubridate':

    interval
The following objects are masked from 'package:base':

    intersect, setdiff, union
library(dplyr)

# Convert ts_egg to a data frame with proper dates
start_date <- as.Date("1980-01-01")  # Adjust based on actual data
date_seq <- seq(start_date, by = "month", length.out = length(ts_egg))

ts_egg_df <- data.frame(
  Date = date_seq,
  Price = as.numeric(ts_egg)
)

# Fit ARIMA model
fit <- auto.arima(ts_egg)

# Generate forecasts
mean_forecast <- forecast(meanf(ts_egg, h = 12))
naive_forecast <- forecast(naive(ts_egg, h = 12))
snaive_forecast <- forecast(snaive(ts_egg, h = 12))
drift_forecast <- forecast(rwf(ts_egg, h = 12, drift = TRUE))
arima_forecast <- forecast(fit, h = 12)

# Convert forecasts to data frame
# Extract forecasted values properly
forecast_df <- data.frame(
  Date = seq(max(ts_egg_df$Date) %m+% months(1), by = "month", length.out = 12),
  Mean = as.numeric(mean_forecast$mean),    # Extract mean forecast
  Naive = as.numeric(naive_forecast$mean),
  SNaive = as.numeric(snaive_forecast$mean),
  Drift = as.numeric(drift_forecast$mean),
  ARIMA = as.numeric(arima_forecast$mean)
)
# Plot with ggplot2
ggplot(ts_egg_df, aes(x = Date, y = Price)) +
  geom_line(color = "black", size = 1) +  # Actual Data
  geom_line(data = forecast_df, aes(x = Date, y = Mean, color = "Mean"), size = 1, linetype = "dashed") +
  geom_line(data = forecast_df, aes(x = Date, y = Naive, color = "Naïve"), size = 1, linetype = "dotted") +
  geom_line(data = forecast_df, aes(x = Date, y = SNaive, color = "Seasonal Naïve"), size = 1, linetype = "dotdash") +
  geom_line(data = forecast_df, aes(x = Date, y = Drift, color = "Drift"), size = 1, linetype = "twodash") +
  geom_line(data = forecast_df, aes(x = Date, y = ARIMA, color = "ARIMA Fit"), size = 1) +
  ggtitle("Egg Price Forecast") +
  xlab("Year") + ylab("Egg Price") +
  scale_color_manual(values = c("Mean" = "blue", "Naïve" = "red", 
                                "Seasonal Naïve" = "green", "Drift" = "orange", 
                                "ARIMA Fit" = "purple")) +
  guides(colour = guide_legend(title = "Forecast Methods")) +
  theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

accuracy(snaive(ts_egg, h=12))
                     ME      RMSE       MAE       MPE     MAPE MASE      ACF1
Training set 0.05290152 0.4430971 0.2454356 0.7127024 15.07352    1 0.9127782
accuracy(fit) # Fit is better
                       ME      RMSE        MAE        MPE    MAPE      MASE
Training set -0.000116345 0.1234726 0.07603313 -0.5693686 5.16475 0.3097885
                      ACF1
Training set -0.0008840435
fit %>% forecast(h=36) %>% autoplot() #next 3 years

sarima.for(ts_egg, 36, 2,1,0,0,1,1,12)

$pred
          Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
2025 4.234900 4.275501 4.230793 4.265863 4.151832 4.091748 4.126341 4.176166
2026 4.480939 4.472025 4.416975 4.444440 4.328499 4.267228 4.301480 4.351118
2027 4.655785 4.646871 4.591820 4.619285 4.503345 4.442073 4.476326 4.525963
          Sep      Oct      Nov      Dec
2025 4.250964 4.231531 4.276110 4.444554
2026 4.425857 4.406394 4.450963 4.619402
2027 4.600702 4.581239 4.625808 4.794247

$se
           Jan       Feb       Mar       Apr       May       Jun       Jul
2025 0.1220128 0.1777180 0.2305567 0.2744788 0.3135397 0.3484406 0.3803092
2026 0.5357042 0.5595878 0.5828422 0.6052431 0.6268924 0.6478266 0.6681120
2027 0.7806826 0.7995263 0.8182012 0.8364897 0.8544247 0.8719971 0.8892279
           Aug       Sep       Oct       Nov       Dec
2025 0.4097385 0.4372126 0.4630646 0.4875507 0.5108653
2026 0.6878008 0.7069425 0.7255796 0.7437500 0.7614868
2027 0.9061322 0.9227277 0.9390302 0.9550545 0.9708141